library(vcfR)
## Warning: package 'vcfR' was built under R version 3.5.2
##
## ***** *** vcfR *** *****
## This is vcfR 1.9.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(ggplot2)
library(gridExtra)
library(ggridges)
## Warning: package 'ggridges' was built under R version 3.5.2
library(adegenet)
## Loading required package: ade4
## Warning: package 'ade4' was built under R version 3.5.2
##
## /// adegenet 2.1.3 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
Read in your unfiltered vcf file
#read in vcf as vcfR
vcfR <- read.vcfR("~/Desktop/aph.data/populations.snps.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 210336
## column count: 124
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 210336
## Character matrix gt cols: 124
## skip: 0
## nrows: 210336
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant 48000
Processed variant 49000
Processed variant 50000
Processed variant 51000
Processed variant 52000
Processed variant 53000
Processed variant 54000
Processed variant 55000
Processed variant 56000
Processed variant 57000
Processed variant 58000
Processed variant 59000
Processed variant 60000
Processed variant 61000
Processed variant 62000
Processed variant 63000
Processed variant 64000
Processed variant 65000
Processed variant 66000
Processed variant 67000
Processed variant 68000
Processed variant 69000
Processed variant 70000
Processed variant 71000
Processed variant 72000
Processed variant 73000
Processed variant 74000
Processed variant 75000
Processed variant 76000
Processed variant 77000
Processed variant 78000
Processed variant 79000
Processed variant 80000
Processed variant 81000
Processed variant 82000
Processed variant 83000
Processed variant 84000
Processed variant 85000
Processed variant 86000
Processed variant 87000
Processed variant 88000
Processed variant 89000
Processed variant 90000
Processed variant 91000
Processed variant 92000
Processed variant 93000
Processed variant 94000
Processed variant 95000
Processed variant 96000
Processed variant 97000
Processed variant 98000
Processed variant 99000
Processed variant 100000
Processed variant 101000
Processed variant 102000
Processed variant 103000
Processed variant 104000
Processed variant 105000
Processed variant 106000
Processed variant 107000
Processed variant 108000
Processed variant 109000
Processed variant 110000
Processed variant 111000
Processed variant 112000
Processed variant 113000
Processed variant 114000
Processed variant 115000
Processed variant 116000
Processed variant 117000
Processed variant 118000
Processed variant 119000
Processed variant 120000
Processed variant 121000
Processed variant 122000
Processed variant 123000
Processed variant 124000
Processed variant 125000
Processed variant 126000
Processed variant 127000
Processed variant 128000
Processed variant 129000
Processed variant 130000
Processed variant 131000
Processed variant 132000
Processed variant 133000
Processed variant 134000
Processed variant 135000
Processed variant 136000
Processed variant 137000
Processed variant 138000
Processed variant 139000
Processed variant 140000
Processed variant 141000
Processed variant 142000
Processed variant 143000
Processed variant 144000
Processed variant 145000
Processed variant 146000
Processed variant 147000
Processed variant 148000
Processed variant 149000
Processed variant 150000
Processed variant 151000
Processed variant 152000
Processed variant 153000
Processed variant 154000
Processed variant 155000
Processed variant 156000
Processed variant 157000
Processed variant 158000
Processed variant 159000
Processed variant 160000
Processed variant 161000
Processed variant 162000
Processed variant 163000
Processed variant 164000
Processed variant 165000
Processed variant 166000
Processed variant 167000
Processed variant 168000
Processed variant 169000
Processed variant 170000
Processed variant 171000
Processed variant 172000
Processed variant 173000
Processed variant 174000
Processed variant 175000
Processed variant 176000
Processed variant 177000
Processed variant 178000
Processed variant 179000
Processed variant 180000
Processed variant 181000
Processed variant 182000
Processed variant 183000
Processed variant 184000
Processed variant 185000
Processed variant 186000
Processed variant 187000
Processed variant 188000
Processed variant 189000
Processed variant 190000
Processed variant 191000
Processed variant 192000
Processed variant 193000
Processed variant 194000
Processed variant 195000
Processed variant 196000
Processed variant 197000
Processed variant 198000
Processed variant 199000
Processed variant 200000
Processed variant 201000
Processed variant 202000
Processed variant 203000
Processed variant 204000
Processed variant 205000
Processed variant 206000
Processed variant 207000
Processed variant 208000
Processed variant 209000
Processed variant 210000
Processed variant: 210336
## All variants processed
### check the metadata present in your vcf
vcfR
## ***** Object of Class vcfR *****
## 115 samples
## 87 CHROMs
## 210,336 variants
## Object size: 685.5 Mb
## 57.1 percent missing data
## ***** ***** *****
vcfR@fix[1:10,1:8]
## CHROM POS ID REF ALT QUAL FILTER INFO
## [1,] "Pseudochr1" "615693" "48:25:+" "T" "C" NA "PASS" "NS=3;AF=0.333"
## [2,] "Pseudochr1" "615724" "48:56:+" "C" "T" NA "PASS" "NS=3;AF=0.333"
## [3,] "Pseudochr1" "674543" "56:43:+" "A" "T" NA "PASS" "NS=9;AF=0.222"
## [4,] "Pseudochr1" "674545" "56:45:+" "A" "T" NA "PASS" "NS=18;AF=0.222"
## [5,] "Pseudochr1" "674596" "56:96:+" "A" "T" NA "PASS" "NS=14;AF=0.250"
## [6,] "Pseudochr1" "690980" "61:7:+" "G" "A" NA "PASS" "NS=10;AF=0.100"
## [7,] "Pseudochr1" "690991" "61:18:+" "T" "C" NA "PASS" "NS=10;AF=0.300"
## [8,] "Pseudochr1" "691006" "61:33:+" "T" "A" NA "PASS" "NS=10;AF=0.300"
## [9,] "Pseudochr1" "690931" "62:45:-" "G" "A" NA "PASS" "NS=4;AF=0.250"
## [10,] "Pseudochr1" "690906" "62:70:-" "A" "G" NA "PASS" "NS=4;AF=0.250"
vcfR@gt[1:10,1:2]
## FORMAT A_californica_333849
## [1,] "GT:DP:AD:GQ:GL" NA
## [2,] "GT:DP:AD:GQ:GL" NA
## [3,] "GT:DP:AD:GQ:GL" NA
## [4,] "GT:DP:AD:GQ:GL" "1/1:67:0,67:40:-256.40,-21.43,0.00"
## [5,] "GT:DP:AD:GQ:GL" "1/1:67:0,67:40:-226.28,-20.45,0.00"
## [6,] "GT:DP:AD:GQ:GL" "0/0:2:2,0:32:-0.00,-2.62,-6.23"
## [7,] "GT:DP:AD:GQ:GL" "1/1:2:0,2:27:-4.96,-2.15,-0.00"
## [8,] "GT:DP:AD:GQ:GL" "0/0:2:2,0:31:-0.00,-2.51,-5.68"
## [9,] "GT:DP:AD:GQ:GL" NA
## [10,] "GT:DP:AD:GQ:GL" NA
#generate popmap file. Two column popmap with the same format as stacks, and the columns must be named 'id' and 'pop'
#sample ids are every @gt column except the first (FORMAT); pop is parsed from characters 3-11 of the id
sample.ids<-colnames(vcfR@gt)[-1]
popmap<-data.frame(id=sample.ids, pop=substr(sample.ids, 3, 11))
Note:If you have a very large vcf output file that you would like to work with in Rstudio, you could implement some percentage based filters (e.g. remove all SNPs above 50% missing data) via vcftools to reduce your filesize before starting to filter in R. Just be aware that once you drop low data samples, your previously enforced (per SNP or locus) missing data % will no longer be exact.
Jon Puritz has an excellent filtering tutorial that is focused specifically on filtering RADseq data: https://www.ddocent.com/filtering/ Some of these RAD specific filters can't be applied to a vcf output by stacks, which doesn't retain metadata like mapping quality and strand, but we can follow these guidelines for hard filtering (he suggests minimum depth = 3, gq = 30), and can implement suggested filters like allele balance and max depth, here in R.
#do hard filtering with this function:
#do hard filtering with this function:
#Converts individual genotypes to NA when they fall below a minimum read depth
#(FORMAT/DP) and/or a minimum genotype quality (FORMAT/GQ), reporting the
#percentage masked for each filter. Returns the filtered vcfR object.
#If a threshold is left NULL that filter is skipped (the original printed a
#nonsense report and masked genotypes with missing DP/GQ even when no
#threshold was requested).
hard.filter.vcf <- function(vcfR, depth=NULL, gq=NULL){
#internal helper: mask genotypes whose numeric FORMAT field falls below 'minimum'
mask.below <- function(vcfR, element, minimum, label){
#extract the numeric FORMAT field from the vcf
vals<- extract.gt(vcfR, element=element, as.numeric=TRUE)
#percentage of non-missing genotypes failing the threshold
pct<-round((sum(vals < minimum, na.rm = TRUE)/sum(!is.na(vals)))*100, 2)
#report filter
print(paste0(pct,"% of genotypes fall below a ",label," of ",minimum," and were converted to NA"))
#convert failing values to NA
vals[vals < minimum] <- NA
#NA out the corresponding genotype calls (column 1 of @gt is FORMAT)
vcfR@gt[,-1][ is.na(vals) ] <- NA
return(vcfR)
}
#apply each filter only when a threshold was actually provided
if (!is.null(depth)){
vcfR<-mask.below(vcfR, element='DP', minimum=depth, label="read depth")
}
if (!is.null(gq)){
vcfR<-mask.below(vcfR, element='GQ', minimum=gq, label="genotype quality")
}
return(vcfR)
}
#hard filter to minimum depth of 3, and minimum genotype quality of 30
vcfR<-hard.filter.vcf(vcfR=vcfR, depth = 3, gq = 30)
## [1] "20.24% of genotypes fall below a read depth of 3 and were converted to NA"
## [1] "7.91% of genotypes fall below a genotype quality of 30 and were converted to NA"
Use this function to filter for allele balance from Puritz SNP filtering tutorial "Allele balance: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous, we expect that the allele balance in our data (for real loci) should be close to 0.5"
#Masks heterozygous genotypes whose allele balance (ref reads / total reads)
#falls outside the .25-.75 window, reporting the percentage masked.
#Returns the filtered vcfR object.
filter.allele.balance <- function(vcfR){
#extract AD (allele depths, "ref,alt") from the vcf
ad.matrix<- extract.gt(vcfR, element='AD')
#extract GT from the vcf
gt.matrix<- extract.gt(vcfR, element='GT')
#mask ad matrix to include only called hets from gt matrix
ad.matrix[gt.matrix != "0/1"]<-NA
#split allele 1 depth from allele 2 depth
al1<-structure(as.numeric(gsub(",.*", "", ad.matrix)), dim=dim(ad.matrix))
al2<-structure(as.numeric(gsub(".*,", "", ad.matrix)), dim=dim(ad.matrix))
#compute the allele-balance ratio once (was computed twice)
ratio<-al1/(al1 + al2)
#TRUE where a het genotype fails the .25-.75 allele balance window
AB<-ratio > .75 | ratio <.25
#calculate percentage of hets (and of all genotypes) failing AB filter
p<-round(sum(AB, na.rm = TRUE) / sum(!is.na(AB))*100, 2)
j<-round(sum(AB, na.rm = TRUE) / sum(!is.na(gt.matrix))*100, 2)
print(paste0(p,"% of het genotypes (",j,"% of all genotypes) fall outside of .25 - .75 allele balance and were converted to NA"))
#treat non-het / missing entries as passing so the logical mask has no NAs
#(the original relied on R skipping NA subscripts only because the
#replacement value had length 1)
AB[is.na(AB)]<-FALSE
#convert failing genotypes to NA (column 1 of @gt is FORMAT)
vcfR@gt[,-1][AB]<-NA
return(vcfR)
}
#execute allele balance filter
vcfR<-filter.allele.balance(vcfR)
## [1] "7.13% of het genotypes (0.35% of all genotypes) fall outside of .25 - .75 allele balance and were converted to NA"
max depth filter (super high depth loci are likely multiple loci stuck together into a single paralogous locus). Note: we want to apply this 'per SNP' rather than 'per genotype' otherwise we will simply be removing most of the genotypes from our deepest sequenced samples (because sequencing depth is so variable between samples)
#function to vis and filter by maximum depth
#function to vis and filter by maximum depth
#With maxdepth=NULL: plots the distribution of mean depth per SNP so a cutoff
#can be chosen. With maxdepth set: removes SNPs whose mean depth across
#genotyped samples exceeds maxdepth and returns the filtered vcfR.
max.depth <- function(vcfR, maxdepth=NULL){
#extract depth from the vcf
dp.matrix<- extract.gt(vcfR, element='DP', as.numeric=TRUE)
#mean depth per SNP across successfully genotyped samples
#(NaN when a SNP has no genotyped samples at all)
snpdepth<-rowSums(dp.matrix, na.rm = TRUE)/rowSums(!is.na(dp.matrix))
if (is.null(maxdepth)){
#plot histogram of depth
hist(snpdepth, xlab = "mean of the depth of all samples successfully genotyped at a given SNP", main =NULL)
abline(v=mean(snpdepth, na.rm = TRUE), col="red", lty="dashed")
#zoomed in histogram
hist(snpdepth[snpdepth < 200], xlab = "mean of the depth of all samples successfully genotyped at a given SNP", main ="visualize distribution of SNPs below a depth of 200")
abline(v=mean(snpdepth, na.rm = TRUE), col="red", lty="dashed")
#print
print(paste0("dashed line indicates a mean depth across all SNPs of ",round(mean(snpdepth, na.rm = TRUE),1)))
}
else {
#plot the maxdepth cutoff
hist(snpdepth, xlab = "mean of the depth of all samples successfully genotyped at a given SNP", main ="max depth cutoff")
abline(v=maxdepth, col="red")
#calculate % of snps that fail the max depth filter
i<-round(sum(snpdepth > maxdepth, na.rm = TRUE)/length(snpdepth)*100, 2)
print(paste0(i, "% of SNPs were above a mean depth of ", maxdepth, " and were removed from the vcf"))
#keep SNPs at or below the cutoff; the original kept 'snpdepth < maxdepth',
#which silently dropped SNPs exactly at the cutoff without reporting them,
#and let NaN mean depths (all-missing SNPs) produce NA subscripts
keep<- !is.na(snpdepth) & snpdepth <= maxdepth
#filter vcf
vcfR<-vcfR[keep,]
return(vcfR)
}
}
#visualize and pick appropriate max depth cutoff
max.depth(vcfR)
## [1] "dashed line indicates a mean depth across all SNPs of 37.7"
#filter vcf by the max depth cutoff you chose
vcfR<-max.depth(vcfR, maxdepth = 100)
## [1] "12.53% of SNPs were above a mean depth of 100 and were removed from the vcf"
Note: It may be a good idea to additionally filter out SNPs that are significantly out of HWE if you have a really good idea of what the population structure in your sample looks like and good sample sizes in each pop. For this dataset, which is highly structured (many isolated island pops) with species boundaries that are in flux, I am not going to use a HWE filter, because I don't feel comfortable confidently identifying populations in which we can expect HWE.
#check vcfR to see how many SNPs we have left
vcfR
## ***** Object of Class vcfR *****
## 115 samples
## 85 CHROMs
## 183,980 variants
## Object size: 422.7 Mb
## 76.07 percent missing data
## ***** ***** *****
Determining which samples to retain is highly project specific, and is contingent on your sampling, your taxa, the sequencing results, etc. It is also wise to take missing data into account by population. Often we don't know a-priori what our populations will look like, but in general we want to avoid making inferences about populations if they have consistently less information than the rest of our dataset. On the flip side of that coin, you may have designed a project to get information about a rare sample/population that lacks existing genetic sampling, and therefore must retain specific samples despite low sequencing and high missing data. This is a reality, and simply needs to be addressed carefully.
#define function to visualize missing data
#define function to visualize missing data
#Three modes:
#  popmap provided, cutoff NULL -> per-pop violin/dotplots of missingness and
#    avg depth, plus SNPs-retained-per-cutoff plots; returns per-sample summary df
#  popmap NULL, cutoff NULL -> same summary plots/df minus the per-pop panels
#  cutoff provided -> drops samples whose missingness exceeds cutoff from the
#    vcf and returns the filtered vcfR
missing.by.sample <- function(vcfR, popmap=NULL, cutoff=NULL){
#extract depth from the vcf
dp<- extract.gt(vcfR, element='DP', as.numeric=TRUE)
#proportion of missing genotypes per sample (needed by every mode)
miss<-colSums(is.na(dp))/nrow(dp)
if (is.null(cutoff)){
#per-sample summary built up front so it is always available at return();
#originally df.x was created only inside the popmap branch, so the
#no-popmap path crashed with "object 'df.x' not found"
df.x<-data.frame(id=colnames(dp), missingness=miss,
                 avg.depth=colMeans(dp, na.rm = TRUE), row.names = NULL)
if (is.null(popmap)) {
print("No popmap provided")
}
else {
#popmap must be a two column dataframe with 'id' and 'pop' columns
#id's must match the ids in the vcf file
#bring in popmap
df.f<-merge(df.x, popmap, by="id")
#plot missingness by pop
plot1<-ggplot(df.f, aes(x= reorder(pop, -missingness), y=missingness)) +
geom_violin()+ theme_classic()+
theme(axis.title.x = element_blank(), axis.text.x = element_text(angle = 60, hjust = 1))+
ylab("proportion missing")+
geom_dotplot(binaxis='y', stackdir='center', dotsize=.8)
#dotplot avg depth of sequencing by pop
plot2<-ggplot(df.f, aes(x= reorder(pop, -missingness), y=avg.depth)) +
geom_violin()+ theme_classic()+
theme(axis.title.x = element_blank(), axis.text.x = element_text(angle = 60, hjust = 1))+
ylab("average depth")+
geom_dotplot(binaxis='y', stackdir='center', dotsize=.8)
#show both panels stacked
grid.arrange(plot1,plot2)
}
#ordered sample names; loop-invariant (was recomputed every iteration)
indiv<-colnames(dp)
#initialize dataframe
df.y<- data.frame(indiv=character(), snps.retained=numeric(), filt=numeric())
#loop over candidate per-SNP completeness cutoffs
for (i in c(.5,.6,.7,.8,.9,1)){
#the completeness cutoff applied to every sample in this iteration
filt<-rep(i, times =ncol(dp))
#number of loci successfully genotyped per sample among SNPs passing cutoff i
snps.retained<-colSums(!is.na(dp[(rowSums(is.na(dp))/ncol(dp) <= 1-i),]))
#append the data from this iteration to existing df
df.y<-rbind(df.y, as.data.frame(cbind(indiv, filt, snps.retained)))
}
#make columns numeric for plotting
df.y$filt<-as.numeric(as.character(df.y$filt))
df.y$snps.retained<-as.numeric(as.character(df.y$snps.retained))
#visualize color coded by individual
print(
ggplot(df.y, aes(x=filt, y=snps.retained, color=indiv))+
geom_point()+
ggtitle("SNPs retained by filtering scheme") +
xlab("fraction of non-missing genotypes required to retain each SNP (0-1)") + ylab("non-missing SNPs retained per sample")+
theme_light()+
theme(legend.position="none")
)
#show plot with missingness by sample, candidate cutoffs marked
dotchart(sort(miss), cex=.5, xlab = "proportion missing data")
abline(v=c(.5,.6,.7,.8,.9,1), lty="dashed")
#return the dataframe showing the missingness and avg depth per individual
return(df.x)
}
else {
#vis plot to show where cutoff was set
dotchart(sort(miss), cex=.5)
abline(v=cutoff, col="red")
#report; the original message said samples "fall below" the cutoff, but the
#removed samples are those whose missingness EXCEEDS it
print(paste0(sum(miss > cutoff)," samples exceed the ",cutoff," missing data cutoff, and were removed from VCF"))
#drop those individuals from vcfR (the FORMAT column is never matched, so it is retained)
vcfR@gt <- vcfR@gt[,!(colnames(vcfR@gt) %in% names(miss)[miss > cutoff])]
return(vcfR)
}
}
#run function to visualize samples
missing.by.sample(vcfR=vcfR, popmap = popmap)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## id missingness avg.depth
## 1 A_californica_333849 0.5809381 47.460629
## 2 A_californica_333854 0.7109631 20.255524
## 3 A_californica_333855 0.7393304 17.652508
## 4 A_californica_333857 0.5841070 50.891461
## 5 A_californica_333860 0.6818295 22.429506
## 6 A_californica_333862 0.7179530 37.782024
## 7 A_californica_333868 0.9849223 4.482696
## 8 A_californica_333871 0.8305685 17.587514
## 9 A_californica_333873 0.8739591 12.737893
## 10 A_californica_333874 0.7454506 30.152204
## 11 A_californica_333914 0.8313947 10.830754
## 12 A_californica_333917 0.6882705 23.255353
## 13 A_californica_333925 0.9996250 3.695652
## 14 A_californica_333931 0.7365801 17.533736
## 15 A_californica_333932 0.6896347 23.266580
## 16 A_californica_333934 0.6658278 24.313853
## 17 A_californica_334002 0.6001685 35.250432
## 18 A_californica_334012 0.5226220 64.546045
## 19 A_californica_334015 0.6171106 30.671824
## 20 A_californica_334017 0.7464453 16.327188
## 21 A_californica_334019 0.8017284 11.815560
## 22 A_californica_334171 0.6030221 16.198875
## 23 A_californica_342037 0.7796065 13.047425
## 24 A_californica_342048 0.6896402 22.126865
## 25 A_californica_342050 0.4996684 80.175262
## 26 A_californica_342051 0.9878574 4.436885
## 27 A_californica_342066 0.6655506 27.582412
## 28 A_californica_342072 0.8309599 10.948907
## 29 A_californica_343428 0.7406512 32.506507
## 30 A_californica_343430 0.7791119 13.730554
## 31 A_californica_343432 0.7845472 13.450869
## 32 A_californica_343438 0.8222579 11.022354
## 33 A_californica_343442 0.9534243 5.245886
## 34 A_californica_343451 0.8150179 9.722622
## 35 A_californica_393615 0.8199913 17.652878
## 36 A_californica_393616 0.8975106 7.584801
## 37 A_californica_393721 0.7483640 34.910359
## 38 A_coerulescens_396251 0.6605338 34.400897
## 39 A_coerulescens_396254 0.7271443 20.862550
## 40 A_coerulescens_396256 0.8125122 12.535397
## 41 A_coerulescens_396259 0.9441189 5.769380
## 42 A_coerulescens_396262 0.9999891 5.500000
## 43 A_coerulescens_396263 0.8487988 9.943382
## 44 A_coerulescens_396264 0.6134471 38.628997
## 45 A_coerulescens_396265 0.6056908 48.729244
## 46 A_insularis_334025 0.7168062 20.626483
## 47 A_insularis_334029 0.7269758 20.290538
## 48 A_insularis_334031 0.6352049 37.116367
## 49 A_insularis_334032 0.5710621 50.279880
## 50 A_insularis_334033 0.5622622 54.169218
## 51 A_insularis_334034 0.6500272 32.849863
## 52 A_insularis_334037 0.6662518 28.487908
## 53 A_insularis_334038 0.8355636 10.730671
## 54 A_sumichrasti_343502 0.7428253 17.215513
## 55 A_sumichrasti_343503 0.4782096 107.192065
## 56 A_sumichrasti_343510 0.7773562 28.200356
## 57 A_sumichrasti_343511 0.8241168 20.615656
## 58 A_sumichrasti_343512 0.5619850 50.354441
## 59 A_sumichrasti_343513 0.9140124 7.297661
## 60 A_sumichrasti_343514 0.7721111 13.773654
## 61 A_sumichrasti_343515 0.7607620 15.324367
## 62 A_sumichrasti_393633 0.9504783 5.517945
## 63 A_sumichrasti_393635 0.7636591 15.489513
## 64 A_sumichrasti_393636 0.8851506 7.285471
## 65 A_sumichrasti_393637 0.9673497 5.549193
## 66 A_sumichrasti_393638 0.8132243 19.724500
## 67 A_sumichrasti_393639 0.9754539 4.837467
## 68 A_sumichrasti_393640 0.7032775 21.525654
## 69 A_woodhouseii_334059 0.9984509 3.771930
## 70 A_woodhouseii_334062 0.7885259 13.841751
## 71 A_woodhouseii_334063 0.7481791 17.659227
## 72 A_woodhouseii_334086 0.7740896 15.519380
## 73 A_woodhouseii_334088 0.7515871 15.766055
## 74 A_woodhouseii_334096 0.8530873 8.660143
## 75 A_woodhouseii_334132 0.6810469 23.467204
## 76 A_woodhouseii_334133 0.6624416 28.706847
## 77 A_woodhouseii_334134 0.8565388 9.493256
## 78 A_woodhouseii_334142 0.6695891 27.243794
## 79 A_woodhouseii_334148 0.8917872 7.688734
## 80 A_woodhouseii_334153 0.7401076 17.100094
## 81 A_woodhouseii_334156 0.8992988 10.604145
## 82 A_woodhouseii_334161 0.5311936 67.928662
## 83 A_woodhouseii_334170 0.9219372 8.109525
## 84 A_woodhouseii_334172 0.9990108 4.538462
## 85 A_woodhouseii_334188 0.7860039 13.432653
## 86 A_woodhouseii_334190 0.6981737 22.919323
## 87 A_woodhouseii_334196 0.6566366 28.788498
## 88 A_woodhouseii_334210 0.6485488 29.604129
## 89 A_woodhouseii_334211 0.6381183 32.132775
## 90 A_woodhouseii_334217 0.8472443 7.879946
## 91 A_woodhouseii_334230 0.7158441 20.419174
## 92 A_woodhouseii_334240 0.9901294 4.274780
## 93 A_woodhouseii_334241 0.8070660 12.252705
## 94 A_woodhouseii_334242 0.8080987 11.744548
## 95 A_woodhouseii_334243 0.7697956 13.304347
## 96 A_woodhouseii_334244 0.6248940 34.955327
## 97 A_woodhouseii_334247 0.8122242 11.948187
## 98 A_woodhouseii_334250 0.6274649 30.675951
## 99 A_woodhouseii_343453 0.9564192 4.996009
## 100 A_woodhouseii_343458 0.8279433 10.529237
## 101 A_woodhouseii_343461 0.6208773 41.288383
## 102 A_woodhouseii_343476 0.7125448 20.503895
## 103 A_woodhouseii_343480 0.8228721 16.833957
## 104 A_woodhouseii_343481 0.7838896 22.375126
## 105 A_woodhouseii_343483 0.6541091 44.614752
## 106 A_woodhouseii_343497 0.7750734 12.092214
## 107 A_woodhouseii_393605 0.6664746 45.215622
## 108 A_woodhouseii_393606 0.7259267 19.693321
## 109 A_woodhouseii_393697 0.7779541 25.888916
## 110 A_woodhouseii_393698 0.8717306 13.045934
## 111 A_woodhouseii_393699 0.9589412 5.666137
## 112 A_woodhouseii_393702 0.7676921 27.260482
## 113 A_woodhouseii_393712 0.6736113 43.000716
## 114 A_woodhouseii_393713 0.8483042 14.840159
## 115 A_woodhouseii_395768 0.6103109 69.415664
#run function to drop samples above the threshold we want from the vcf
vcfR<-missing.by.sample(vcfR=vcfR, cutoff = .93)
## [1] "14 samples fall below 0.93 missing data cutoff, and were removed from VCF"
#subset popmap to only include retained individuals
popmap<-popmap[popmap$id %in% colnames(vcfR@gt),]
#alternatively, you can drop individuals from vcfR manually using the following syntax, if a strict cutoff doesn't work for your dataset
#vcfR@gt <- vcfR@gt[,colnames(vcfR@gt) != "E_trichroa_4702" & colnames(vcfR@gt) != "E_trichroa_27882"]
We can visualize the effect that typical missing data cutoffs will have on both the number of SNPs retained and the total missing data in our entire dataset. We want to choose a cutoff that minimizes the overall missing data in the dataset, while maximizing the total number of loci retained. Note: This will also depend on the samples that we decided to include above. A good rule of thumb is that samples shouldn't be above 50% missing data after applying this cutoff. So if we are retaining low data samples out of necessity or project design, we may have to set a more stringent cutoff at the expense of total SNPs retained for downstream analyses.
#define function
#define function
#With cutoff set: reports and removes SNPs genotyped in fewer than 'cutoff'
#proportion of samples, returning the filtered vcfR. With cutoff NULL:
#diagnostic plots only (per-sample missingness ridges across candidate
#cutoffs, plus loci-retained / total-missingness tradeoff plots).
missing.by.snp <- function(vcfR, cutoff=NULL){
#extract genotype from the vcf
dp.matrix<- extract.gt(vcfR, as.numeric=TRUE)
#proportion of samples missing at each SNP
miss<-rowSums(is.na(dp.matrix))/ncol(dp.matrix)
#candidate per-SNP completeness filter levels
filt.levels<-c(.3,.5,.6,.65,.7,.75,.8,.85,.9,.95,1)
#helper: total SNPs retained and overall missingness at each filter level
#(this loop was duplicated verbatim in both branches of the original)
filt.summary <- function(){
df.x<- data.frame(filt=numeric(), missingness=numeric(), snps.retained=numeric())
for (i in filt.levels){
#subset the matrix to only snps passing filter level i
gen.z.mat<-dp.matrix[1-miss >= i,]
#append total snps retained and total missing data % at this level
df.x<-rbind(df.x, data.frame(filt=i,
missingness=sum(is.na(gen.z.mat))/(ncol(gen.z.mat)*nrow(gen.z.mat)),
snps.retained=nrow(gen.z.mat)))
}
df.x
}
#helper: tradeoff dotplots; a red line marks the chosen cutoff when provided
plot.summary <- function(df.x, cutoff=NULL, miss.lab){
#dotplot of total loci retained at each filter level
plot1<-ggplot(df.x, aes(x=filt)) +
scale_x_continuous(breaks=filt.levels)+
geom_point(aes(y=snps.retained)) +
theme_bw() +
labs(x = "SNP completeness cutoff", y = "total loci retained")
#dotplot of overall missing data at each filter level
plot2<-ggplot(df.x, aes(x=filt)) +
scale_x_continuous(breaks=filt.levels)+
geom_point(aes(y=missingness)) +
theme_bw() +
labs(x = "SNP completeness cutoff", y = miss.lab)
if (!is.null(cutoff)){
plot1<-plot1+geom_vline(xintercept = cutoff, color = "red")
plot2<-plot2+geom_vline(xintercept = cutoff, color = "red")
}
grid.arrange(plot1,plot2)
}
if (!is.null(cutoff)) {
#basic vis to show the chosen cutoff on the tradeoff plots
plot.summary(filt.summary(), cutoff=cutoff, miss.lab="total % missing data")
#calc # of SNPs filtered
p<-round(sum(miss > 1-cutoff)/length(miss)*100, 2)
#report # of SNPs filtered
print(paste0(p,"% of SNPs fell below a completeness cutoff of ", cutoff, " and were removed from the VCF"))
#do filtering
vcfR <- vcfR[miss <= 1-cutoff, ]
return(vcfR)
}
else {
###Part 1
#Vis to understand the interplay between retaining samples and setting a missing data cutoff
#initialize df.y
df.y<- data.frame(filt=numeric(), snps=numeric())
#loop
for (i in filt.levels){
#subset matrix to only SNPs retained at given filter level
gen.z.mat<-dp.matrix[1-miss >= i,]
#per-sample missing data proportion among retained SNPs
snps<-colSums(is.na(gen.z.mat))/nrow(gen.z.mat)
#the completeness cutoff applied in this iteration
filt<-rep(i, times= length(snps))
df.y<-rbind(df.y, data.frame(filt=filt, snps=snps))
}
#visualize filtering levels as stacked raincloud/ridge plots
print(
ggplot(df.y, aes(x = snps, y = as.character(filt), fill = filt, color = filt)) +
geom_density_ridges(jittered_points = TRUE, position = "raincloud", alpha = .25, cex=.5) +
theme_classic() +
labs(x = "missing data proportion in each sample", y = "SNP completeness cutoff") +
theme(legend.position = "none")
)
###Part 2
#Vis to make decision on cutoff
plot.summary(filt.summary(), miss.lab="total proportion missing data")
}
}
#visualize missing data by SNP and the effect of various cutoffs on the missingness of each sample
missing.by.snp(vcfR)
## Picking joint bandwidth of 0.0674
#choose a value that retains an acceptable amount of missing data in each sample, and maximizes SNPs retained while minimizing overall missing data, and filter vcf
vcfR<-missing.by.snp(vcfR, cutoff = .85)
## [1] "89.87% of SNPs fell below a completeness cutoff of 0.85 and were removed from the VCF"
MAF cutoffs can be helpful in removing spurious and uninformative loci from the dataset, but also have the potential to bias downstream inferences. Linck and Battey (2019) have an excellent paper on just this topic. From the paper-
"We recommend researchers using model‐based programs to describe population structure observe the following best practices: (a) duplicate analyses with nonparametric methods suchas PCA and DAPC with cross validation (b) exclude singletons (c) compare alignments with multiple assembly parameters When seeking to exclude only singletons in alignments with missing data (a ubiquitous problem for reduced‐representation library preparation methods), it is preferable to filter by the count (rather than frequency) of the minor allele, because variation in the amount of missing data across an alignment will cause a static frequency cutoff to remove different SFS classes at different sites""
Our package contains a convenient wrapper function that streamlines the investigation of different MAF cutoffs. Warning: this is a heavy function if it is run without the 'min.mac' option. It is converting the entire vcf and running 6 unique dapc iterations, which will take time for large datasets. If you set 'min.mac' it will just filter your dataset, and should run quickly.
#define function
#helper: compute the folded site frequency spectrum from a vcfR object.
#Genotypes are recoded as counts of the alternate allele (0/1/2); for SNPs
#where the alternate allele is actually the major allele (count exceeds the
#number of genotyped individuals), the count is folded so that the returned
#value is always the minor allele count (MAC) for that SNP.
#NOTE(review): only unphased diploid genotypes ("0/0","0/1","1/1") are
#recoded, matching the original implementation — phased ("0|0") calls would
#become NA; confirm upstream data is unphased.
fold.count <- function(vcfR){
  #convert vcfR to matrix and make numeric
  gt.matrix<-extract.gt(vcfR)
  gt.matrix[gt.matrix == "0/0"]<-0
  gt.matrix[gt.matrix == "0/1"]<-1
  gt.matrix[gt.matrix == "1/1"]<-2
  class(gt.matrix) <- "numeric"
  #alternate allele count per SNP
  sfs<-rowSums(gt.matrix, na.rm = TRUE)
  #number of non-missing genotypes per SNP (2*n.genos total alleles)
  n.genos<-rowSums(!is.na(gt.matrix))
  #vectorized fold: identical to the original per-SNP loop, which folded
  #whenever sfs[i] > number of genotyped individuals
  fold<-sfs > n.genos
  sfs[fold]<-2*n.genos[fold] - sfs[fold]
  sfs
}
#investigate (min.mac = NULL) or apply (min.mac set) a minor allele count cutoff
#param vcfR:    a vcfR object
#param popmap:  data.frame with columns 'id' and 'pop' assigning each sample to
#               a population; required only for the exploratory (NULL) mode
#param min.mac: minimum minor allele count for a SNP to be retained; if NULL,
#               run exploratory k-means clustering + DAPC at MAC cutoffs
#               1,2,3,4,5,10 and compare assignments to the popmap
#returns: the filtered vcfR object when min.mac is set; otherwise the function
#         is used for its plots and printed output
min.mac <- function(vcfR, popmap=NULL, min.mac=NULL){
  #calc folded sfs (minor allele count per SNP) — shared by both modes
  sfs<-fold.count(vcfR)
  #hist folded mac
  hist(sfs, main="folded SFS", xlab = "MAC")
  if (is.null(min.mac)) {
    #convert vcfR into genlight
    genlight<-vcfR2genlight(vcfR)
    #run dapc for mac 1,2,3,4,5,10
    for (i in c(1,2,3,4,5,10)){
      #filter genlight by given mac (cutoffs increase, so filtering is cumulative)
      genlight<-genlight[,sfs >= i]
      #subset sfs vector to only the SNPs left in the genlight
      sfs<-sfs[sfs >= i]
      #assign samples to the number of groups present in popmap, retain all PCAs;
      #nInd(genlight)-1 == number of samples - 1 (same value the original took
      #from ncol(gt.matrix)-1; individuals are never removed here)
      grp<-find.clusters(genlight, n.pca = nInd(genlight)-1, n.clust = length(levels(popmap$pop)))
      #check how well that assignment matched up to the provided popmap
      samps<-merge(popmap, data.frame(group=grp$grp, id=labels(grp$grp)), by='id')
      print(paste0("for ", i, " minimum MAC cutoff, compare k means clustering to popmap assignment"))
      print(table(samps$pop, samps$group))
      #run dapc, retain all discriminant axes, and enough PC axes to explain 75% of variance
      dapc1<-dapc(genlight, grp$grp, n.da = length(levels(popmap$pop))-1, pca.select = "percVar", perc.pca = 75)
      #plot compoplot
      compoplot(dapc1, legend=FALSE, col=funky(length(levels(popmap$pop))), show.lab =TRUE, cex.names=.4, main=paste0("min. MAC ",i,", total SNPs ",length(sfs)))
      #print progress
      print(paste0("DAPC with min. MAC ", i, " and ", length(sfs), " total SNPs, complete"))
    }
  }
  else {
    #mark the cutoff on the histogram (min.mac-1 is the highest MAC class removed)
    abline(v=min.mac-1, col="red")
    #calculate % of SNPs to be removed, and print it
    p<-round((sum(sfs < min.mac)/length(sfs))*100, 2)
    print(paste0(p, "% of SNPs fell below a minor allele count of ", min.mac, " and were removed from the VCF"))
    #filter vcfR
    vcfR <- vcfR[sfs >= min.mac,]
    return(vcfR)
  }
}
#use min.mac() to investigate the effect of multiple cutoffs
#(no 'min.mac' argument: runs exploratory k-means clustering + DAPC at MAC
#cutoffs 1-5 and 10 — a heavy step, since the whole vcf is converted and
#6 DAPC iterations are run)
min.mac(vcfR = vcfR, popmap = popmap)
## [1] "for 1 minimum MAC cutoff, compare k means clustering to popmap assignment"
##
## 1 2 3 4 5
## californi 0 1 32 0 0
## coerulesc 0 0 0 6 0
## insularis 0 0 0 0 8
## sumichras 11 1 0 0 0
## woodhouse 0 42 0 0 0
## [1] "DAPC with min. MAC 1 and 17799 total SNPs, complete"
## [1] "for 2 minimum MAC cutoff, compare k means clustering to popmap assignment"
##
## 1 2 3 4 5
## californi 32 1 0 0 0
## coerulesc 0 0 0 0 6
## insularis 0 0 8 0 0
## sumichras 0 1 0 11 0
## woodhouse 0 42 0 0 0
## [1] "DAPC with min. MAC 2 and 11687 total SNPs, complete"
## [1] "for 3 minimum MAC cutoff, compare k means clustering to popmap assignment"
##
## 1 2 3 4 5
## californi 0 0 1 0 32
## coerulesc 6 0 0 0 0
## insularis 0 8 0 0 0
## sumichras 0 0 1 11 0
## woodhouse 0 0 42 0 0
## [1] "DAPC with min. MAC 3 and 8609 total SNPs, complete"
## [1] "for 4 minimum MAC cutoff, compare k means clustering to popmap assignment"
##
## 1 2 3 4 5
## californi 0 0 0 32 1
## coerulesc 0 6 0 0 0
## insularis 0 0 8 0 0
## sumichras 11 0 0 0 1
## woodhouse 0 0 0 0 42
## [1] "DAPC with min. MAC 4 and 7006 total SNPs, complete"
## [1] "for 5 minimum MAC cutoff, compare k means clustering to popmap assignment"
##
## 1 2 3 4 5
## californi 0 1 0 0 32
## coerulesc 0 0 6 0 0
## insularis 0 0 0 8 0
## sumichras 11 1 0 0 0
## woodhouse 0 42 0 0 0
## [1] "DAPC with min. MAC 5 and 5865 total SNPs, complete"
## [1] "for 10 minimum MAC cutoff, compare k means clustering to popmap assignment"
##
## 1 2 3 4 5
## californi 0 0 32 0 1
## coerulesc 0 0 0 6 0
## insularis 0 8 0 0 0
## sumichras 11 0 0 0 1
## woodhouse 0 0 0 0 42
## [1] "DAPC with min. MAC 10 and 3380 total SNPs, complete"
#based on these visualizations, I will remove singletons from the dataset, as it doesn't affect group inference
#min.mac = 2 retains only SNPs whose minor allele is observed at least twice; the filtered vcfR is returned
vcfR<-min.mac(vcfR, min.mac = 2)
## [1] "37.29% of SNPs fell below a minor allele count of 2 and were removed from the VCF"
Check once more to see how many SNPs and individuals remain compared to our original, unfiltered vcf.
vcfR
## ***** Object of Class vcfR *****
## 101 samples
## 37 CHROMs
## 11,687 variants
## Object size: 82.8 Mb
## 6.466 percent missing data
## ***** ***** *****
#plot depth per snp and per sample (numeric DP field extracted from the genotype column)
dp <- extract.gt(vcfR, element = "DP", as.numeric=TRUE)
heatmap.bp(dp, rlabels = FALSE)
#plot genotype quality per snp and per sample (numeric GQ field extracted from the genotype column)
gq <- extract.gt(vcfR, element = "GQ", as.numeric=TRUE)
heatmap.bp(gq, rlabels = FALSE)
We can use the convenient function 'write.vcf' from vcfR to export our filtered vcf file for downstream analyses
#write.vcf(vcfR, "~/Desktop/aph.data/filtered.vcf.gz")
If you need physically unlinked loci (which is a requirement of some programs, e.g. STRUCTURE), this filtering step should always be done last, because it is not quality aware. Introducing a quality-blind linkage filter before doing the quality-based filtering shown here would potentially remove quality SNPs while leaving us with only the low-quality SNPs in a locus or genomic region. If filtering for linkage is needed, it can be done on our output vcf file with a simple one-liner via vcftools (set thin to whatever bp distance you assume linkage decays at in your study organism): vcftools --vcf vcf.vcf --thin 10000 --recode --out vcf